import csv
import dis
import inspect
import os
import sys
import time
import arviz as az
import astropy
import astroquery
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint
import pymc as pm
import random
import scipy as sp
import scipy.stats as stats
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')
from astropy import units as u
from astropy import constants as const
from astropy.coordinates import SkyCoord
from astropy.io import fits
from astropy.io import ascii
from astropy.time import Time
from astropy.time import TimeDelta
from astropy.timeseries import TimeSeries
from astropy.visualization import time_support
time_support()
from astropy.visualization import quantity_support
quantity_support()
from IPython.display import display_html
from IPython.display import Image
from numpy import interp
from pymc import Model, Normal, Gamma, find_MAP
from scipy import integrate
from scipy import linalg as la
from scipy import optimize
from scipy.stats import beta
from scipy.stats import betabinom
from scipy.stats import binom
from scipy.stats import gamma
from scipy.stats import invgamma
from scipy.stats import multivariate_normal as mvn
from scipy.stats import nbinom
from scipy.stats import norm
from scipy.stats import poisson
from scipy.stats import t
from scipy.stats import uniform
from sklearn.linear_model import LinearRegression as linreg
from sklearn import preprocessing as preproc
## notebook display setup: render figures inline at retina resolution
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
## arviz plotting theme for all diagnostic plots below
az.style.use("arviz-darkgrid")
## pretty-printer used for dict-like output (e.g. the MAP estimate)
pp = pprint.PrettyPrinter(indent=4)
## set seeds for reproducibility
# fix: the original seeded only Python's stdlib `random` module, which
# numpy (and anything built on it) does not consult — so the stated
# reproducibility goal was not achieved. Seed numpy's global RNG too.
random.seed(5731)
np.random.seed(5731)

## import data: expects columns MV, MV_err, LogP, LogP_err
df = pd.read_csv('cepheids.csv')
display(df.describe())
| | MV | MV_err | LogP | LogP_err |
|---|---|---|---|---|
| count | 203.000000 | 203.000000 | 203.000000 | 203.000000 |
| mean | -3.559606 | 0.096749 | 0.797635 | 0.031084 |
| std | 0.799984 | 0.032976 | 0.274683 | 0.020820 |
| min | -6.080000 | 0.080000 | -0.040000 | 0.020000 |
| 25% | -4.030000 | 0.090000 | 0.615000 | 0.020000 |
| 50% | -3.580000 | 0.090000 | 0.780000 | 0.020000 |
| 75% | -3.170000 | 0.100000 | 0.970000 | 0.020000 |
| max | 0.250000 | 0.460000 | 1.620000 | 0.070000 |
Let's visualize the data to see what we're working with.
## histograms of the magnitudes and of the log periods (KDE overlaid)
for column, n_bins, plot_title in (
        ('MV', 20, 'Histogram of Cepheid Magnitudes'),
        ('LogP', 25, 'Histogram of Cepheid Log Periods')):
    hist_fig, hist_ax = plt.subplots(figsize=(10, 6), facecolor='white')
    sns.histplot(data=df, x=column, kde=True, bins=n_bins, ax=hist_ax)
    hist_ax.set_title(plot_title)
    plt.show()

## scatter of log period vs magnitude, with error bars on both axes
fig, ax = plt.subplots(figsize=(10, 8), facecolor='white')
ax.errorbar(df['LogP'], df['MV'],
            xerr=df['LogP_err'], yerr=df['MV_err'],
            fmt='o', color='black')
ax.set_xlabel('Log Period')
ax.set_ylabel('Magnitude')
ax.grid(alpha=0.75)
ax.set_title('Cepheid Log Period vs Magnitude')
plt.show()
Before performing the linear regression with PyMC, let's try the naive approach of fitting a line to the data. We can do this with the np.polyfit function. This function takes the x and y data, and the order of the polynomial to fit. We'll use a first order polynomial, which is a line.
## equation for a line
# fix: PEP 8 (E731) — use a def, not a lambda bound to a name
def line(m, x, b):
    """Return the line m*x + b (argument order matches the call sites below)."""
    return m*x + b

## naive straight-line fit with np.polyfit (degree 1 == a line)
# NOTE(review): this is an unweighted least-squares fit — the MV_err /
# LogP_err measurement errors are deliberately ignored here; the MCMC
# analysis below is the more careful treatment.
m, b = np.polyfit(df['LogP'], df['MV'], 1)
estimated_MV = line(m, df['LogP'], b)
## data with error bars, overlaid with the naive least-squares line
fig, ax = plt.subplots(figsize=(10, 8), facecolor='white')
ax.errorbar(df['LogP'], df['MV'],
            xerr=df['LogP_err'], yerr=df['MV_err'],
            fmt='o', color='black', label='Data')
ax.plot(df['LogP'], estimated_MV, color='red', label='Linear Regression')
## axis labels, grid, title, legend
ax.set_xlabel('Log Period')
ax.set_ylabel('Magnitude')
ax.grid(alpha=0.75)
ax.set_title('Cepheid Log Period vs Magnitude')
ax.legend()
plt.show()
We can compare this to the results of the mcmc analysis later.
Now, I'll follow the steps outlined in the introductory file to perform the MCMC analysis. In this case, y will be the MV, and x will be LogP (there is only one x in our case)
## regression variables: y = MV (response), x = LogP (single predictor).
## NOTE(review): x_err / y_err are extracted but NOT used by the model
## below — it treats both x and y as exactly measured.
x = df['LogP']
x_err = df['LogP_err']
y = df['MV']
y_err = df['MV_err']

basic_model = Model()
# fix: the with-block body lost its indentation when the notebook was
# flattened; restored so the snippet is runnable.
with basic_model:
    ## priors for unknown model parameters:
    ## Beta[0] = intercept, Beta[1] = slope, each Normal(mu=0, tau=0.1)
    Beta = Normal('beta', mu=0, tau=1./10, shape=2)
    ## precision (1/variance) of the observation noise
    precision = Gamma('precision', alpha=1, beta=1)
    ## expected magnitude as a linear function of log period
    mu = Beta[0] + Beta[1]*x
    ## likelihood of the observed magnitudes
    Y_obs = Normal('Y_obs', mu=mu, tau=precision, observed=y)

## maximum a posteriori point estimate of the parameters
map_estimate = find_MAP(model=basic_model)
pp.pprint(map_estimate)
{ 'beta': array([-2.0061211 , -1.94716412]),
'precision': array(2.76414659),
'precision_log__': array(1.01673194)}
# Initialize random number generator for reproducible sampling
RANDOM_SEED = 5731
rng = np.random.default_rng(RANDOM_SEED)

# fix: with-block indentation restored (lost in notebook flattening)
with basic_model:
    # start the chains at the MAP estimate
    # NOTE(review): PyMC recommends its default jitter+adapt_diag init over
    # a MAP start for NUTS; kept here to match the original analysis.
    start = find_MAP()
    # draw 1000 posterior samples per chain (PyMC defaults: 4 chains,
    # 1000 tune + 1000 draw each)
    # fix: the original defined RANDOM_SEED/rng but never passed a seed to
    # pm.sample, so the sampling itself was not reproducible.
    trace = pm.sample(start=start, random_seed=RANDOM_SEED,
                      return_inferencedata=True)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [beta, precision]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 7 seconds.
display(trace)
<xarray.Dataset>
Dimensions: (chain: 4, draw: 1000, beta_dim_0: 2)
Coordinates:
* chain (chain) int64 0 1 2 3
* draw (draw) int64 0 1 2 3 4 5 6 7 ... 992 993 994 995 996 997 998 999
* beta_dim_0 (beta_dim_0) int64 0 1
Data variables:
beta (chain, draw, beta_dim_0) float64 -1.749 -2.209 ... -1.871
precision (chain, draw) float64 2.343 2.476 2.493 ... 3.018 2.402 2.466
Attributes:
created_at: 2022-11-02T11:19:58.856329
arviz_version: 0.12.1
inference_library: pymc
inference_library_version: 4.2.1
sampling_time: 6.896633625030518
tuning_steps: 1000<xarray.Dataset>
Dimensions: (chain: 4, draw: 1000, Y_obs_dim_0: 203)
Coordinates:
* chain (chain) int64 0 1 2 3
* draw (draw) int64 0 1 2 3 4 5 6 7 ... 993 994 995 996 997 998 999
* Y_obs_dim_0 (Y_obs_dim_0) int64 0 1 2 3 4 5 6 ... 197 198 199 200 201 202
Data variables:
Y_obs (chain, draw, Y_obs_dim_0) float64 -0.7198 -0.5837 ... -0.993
Attributes:
created_at: 2022-11-02T11:19:59.460390
arviz_version: 0.12.1
inference_library: pymc
inference_library_version: 4.2.1<xarray.Dataset>
Dimensions: (chain: 4, draw: 1000)
Coordinates:
* chain (chain) int64 0 1 2 3
* draw (draw) int64 0 1 2 3 4 5 6 ... 994 995 996 997 998 999
Data variables: (12/16)
energy_error (chain, draw) float64 0.1591 0.0002092 ... -0.02435
process_time_diff (chain, draw) float64 0.001719 0.002559 ... 0.001123
step_size_bar (chain, draw) float64 0.3273 0.3273 ... 0.3308 0.3308
largest_eigval (chain, draw) float64 nan nan nan nan ... nan nan nan
energy (chain, draw) float64 194.8 192.1 191.7 ... 191.1 190.1
max_energy_error (chain, draw) float64 1.334 -0.22 ... 0.1259 -0.02435
... ...
diverging (chain, draw) bool False False False ... False False
step_size (chain, draw) float64 0.4004 0.4004 ... 0.3715 0.3715
acceptance_rate (chain, draw) float64 0.5756 0.9971 ... 0.9193 0.996
tree_depth (chain, draw) int64 3 3 2 4 2 4 4 4 ... 3 3 3 4 3 2 4 3
perf_counter_start (chain, draw) float64 5.132e+04 5.132e+04 ... 5.132e+04
n_steps (chain, draw) float64 7.0 7.0 3.0 11.0 ... 3.0 11.0 7.0
Attributes:
created_at: 2022-11-02T11:19:58.866146
arviz_version: 0.12.1
inference_library: pymc
inference_library_version: 4.2.1
sampling_time: 6.896633625030518
tuning_steps: 1000<xarray.Dataset>
Dimensions: (Y_obs_dim_0: 203)
Coordinates:
* Y_obs_dim_0 (Y_obs_dim_0) int64 0 1 2 3 4 5 6 ... 197 198 199 200 201 202
Data variables:
Y_obs (Y_obs_dim_0) float64 -3.47 -3.15 -3.33 ... 0.25 -3.46 -4.86
Attributes:
created_at: 2022-11-02T11:19:59.462411
arviz_version: 0.12.1
inference_library: pymc
inference_library_version: 4.2.1
posterior = trace.posterior
display(posterior)
<xarray.Dataset>
Dimensions: (chain: 4, draw: 1000, beta_dim_0: 2)
Coordinates:
* chain (chain) int64 0 1 2 3
* draw (draw) int64 0 1 2 3 4 5 6 7 ... 992 993 994 995 996 997 998 999
* beta_dim_0 (beta_dim_0) int64 0 1
Data variables:
beta (chain, draw, beta_dim_0) float64 -1.749 -2.209 ... -1.871
precision (chain, draw) float64 2.343 2.476 2.493 ... 3.018 2.402 2.466
Attributes:
created_at: 2022-11-02T11:19:58.856329
arviz_version: 0.12.1
inference_library: pymc
inference_library_version: 4.2.1
sampling_time: 6.896633625030518
tuning_steps: 1000
ts = az.summary(trace)
## posterior summary table (mean, sd, HDI bounds, MCSE, ESS, r_hat), 3 decimals
display(az.summary(trace, round_to=3))
## trace plots: marginal density and per-chain sample paths for each parameter
az.plot_trace(trace,figsize=(10, 9));
plt.show()
## posterior density plots with HDI annotations
az.plot_posterior(trace, figsize=(10, 4));
plt.show();
## autocorrelation of the beta chains (chains combined) out to lag 200
az.plot_autocorr(trace, var_names=['beta'], filter_vars="like", max_lag=200,combined=True,figsize=(10, 4))
plt.show();
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| beta[0] | -2.002 | 0.130 | -2.229 | -1.747 | 0.003 | 0.002 | 1691.145 | 1682.825 | 1.001 |
| beta[1] | -1.953 | 0.153 | -2.242 | -1.678 | 0.004 | 0.003 | 1675.995 | 1619.767 | 1.002 |
| precision | 2.773 | 0.276 | 2.272 | 3.315 | 0.006 | 0.004 | 2194.811 | 1890.977 | 1.002 |
## posterior predictive check: simulate replicated Y_obs datasets from the
## fitted model and attach them to the trace
# fix: with-block indentation restored (lost in notebook flattening)
with basic_model:
    pm.sample_posterior_predictive(trace, extend_inferencedata=True)

## inspect the predictive samples and compare their density to the data
display(trace.posterior_predictive)
az.plot_ppc(trace, num_pp_samples=100);
Sampling: [Y_obs]
<xarray.Dataset>
Dimensions: (chain: 4, draw: 1000, Y_obs_dim_0: 203)
Coordinates:
* chain (chain) int64 0 1 2 3
* draw (draw) int64 0 1 2 3 4 5 6 7 ... 993 994 995 996 997 998 999
* Y_obs_dim_0 (Y_obs_dim_0) int64 0 1 2 3 4 5 6 ... 197 198 199 200 201 202
Data variables:
Y_obs (chain, draw, Y_obs_dim_0) float64 -2.829 -3.34 ... -4.086
Attributes:
created_at: 2022-11-02T11:20:08.928194
arviz_version: 0.12.1
inference_library: pymc
inference_library_version: 4.2.1
Let's compare the posterior predictive sampling to the data.
## equation for a line
# fix: PEP 8 (E731) — use a def, not a lambda bound to a name
def line(m, x, b):
    """Return the line m*x + b."""
    return m*x + b

## dense, sorted grid of log-periods extending past the data range, so the
## fitted lines and CI band draw smoothly across the whole axis
sortedP = np.linspace(df['LogP'].min()*2, df['LogP'].max()*2, 1000)

## naive unweighted least-squares fit (slope m, intercept b); the x order
## doesn't matter for the fit itself
m, b = np.polyfit(df['LogP'], df['MV'], 1)
estimated_MV = line(m, sortedP, b)

## lines from the posterior summary `ts`.
## In the model, mu = Beta[0] + Beta[1]*x, so beta[0] is the INTERCEPT and
## beta[1] is the SLOPE.
## fix: the original called line(ts['mean'][0], sortedP, ts['mean'][1]),
## passing the intercept (beta[0]) as the slope and vice versa; the bug was
## masked because the two values are numerically close (≈ -2.00 vs -1.95).
## Index by label — also avoids deprecated positional Series[int] indexing.
m_mean = line(ts['mean']['beta[1]'], sortedP, ts['mean']['beta[0]'])
m_03 = line(ts['hdi_3%']['beta[1]'], sortedP, ts['hdi_3%']['beta[0]'])
m_97 = line(ts['hdi_97%']['beta[1]'], sortedP, ts['hdi_97%']['beta[0]'])
## NOTE(review): pairing the 3% slope with the 3% intercept is only a rough
## envelope; a proper band would evaluate line() over the posterior draws.

## plot the data with errors, the CI band, and both fitted lines
fig, ax = plt.subplots(figsize=(10, 8), facecolor='white')
ax.errorbar(df['LogP'], df['MV'], xerr=df['LogP_err'], yerr=df['MV_err'],
            fmt='o', markersize=3, color='black', alpha=0.9, label='Data')
## shaded 97% credible-interval band from the posterior
ax.fill_between(sortedP, m_03, m_97, color='grey', alpha=0.5,
                label='Posterior 97% CI')
## naive least-squares line (thick, underneath)
ax.plot(sortedP, estimated_MV, lw=6, color='teal', label='Linear Regression')
## posterior-mean line on top
ax.plot(sortedP, m_mean, color='aqua', label='Posterior Mean')
## styling
ax.set_xlim(-0.07, 1.75)
ax.set_xlabel('Log Period')
ax.set_ylabel('Magnitude')
ax.grid(alpha=0.5)
ax.set_title('Cepheid Log Period vs Magnitude')
ax.legend()
plt.show()
There's a greater difference here compared to the analysis from last week; we can see that the naive least-squares fit underestimates the magnitude at low periods and overestimates it at high periods.